{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Data Processing\n",
    "### Process data into preferred format\n",
    "* NER Dataset\n",
    "* WNUT17 dataset in CoNLL format (2017)\n",
    "\n",
    "### Retrieve list of words and tags\n",
    "* Words are stored as a list of words in a text file\n",
    "* Tags are stored as a list of tags in a text file"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Final format of training data\n",
    "* In text files\n",
    "* Each example is a pair of 2 sequences: sequence of words - sequence of tags, stored on the same line number of the text file and the label file"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Import dependencies"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {},
   "outputs": [],
   "source": [
    "import os\n",
    "import re\n",
    "import pandas as pd\n",
    "import numpy as np"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### NER Dataset"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>Sentence #</th>\n",
       "      <th>Word</th>\n",
       "      <th>POS</th>\n",
       "      <th>Tag</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>Sentence: 1</td>\n",
       "      <td>Thousands</td>\n",
       "      <td>NNS</td>\n",
       "      <td>O</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>Sentence: 1</td>\n",
       "      <td>of</td>\n",
       "      <td>IN</td>\n",
       "      <td>O</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>Sentence: 1</td>\n",
       "      <td>demonstrators</td>\n",
       "      <td>NNS</td>\n",
       "      <td>O</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>Sentence: 1</td>\n",
       "      <td>have</td>\n",
       "      <td>VBP</td>\n",
       "      <td>O</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>Sentence: 1</td>\n",
       "      <td>marched</td>\n",
       "      <td>VBN</td>\n",
       "      <td>O</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "    Sentence #           Word  POS Tag\n",
       "0  Sentence: 1      Thousands  NNS   O\n",
       "1  Sentence: 1             of   IN   O\n",
       "2  Sentence: 1  demonstrators  NNS   O\n",
       "3  Sentence: 1           have  VBP   O\n",
       "4  Sentence: 1        marched  VBN   O"
      ]
     },
     "execution_count": 2,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Load the NER corpus, then forward-fill missing cells in every column\n",
    "# (presumably 'Sentence #' is only populated on the first row of each sentence - confirm against the CSV).\n",
    "# DataFrame.ffill() replaces the deprecated fillna(method='ffill') spelling.\n",
    "data = pd.read_csv('./ner_dataset.csv', encoding='latin1').ffill()\n",
    "data.head()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Tags:  ['O', 'B-eve', 'I-per', 'I-tim', 'I-gpe', 'I-org', 'I-nat', 'B-per', 'I-art', 'B-art', 'B-org', 'B-tim', 'I-eve', 'B-nat', 'B-geo', 'I-geo', 'B-gpe']\n"
     ]
    }
   ],
   "source": [
    "# inspect the distinct tags present in the NER dataset\n",
    "tags = [*set(data[\"Tag\"].values)]\n",
    "n_tags = len(tags)\n",
    "print(\"Tags: \", tags)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "metadata": {},
   "outputs": [],
   "source": [
    "# simplify tags to match with WNUT 17 dataset: fold 'gpe' into 'geo' (e.g. B-gpe -> B-geo)\n",
    "# literal, vectorised replacement; tags without 'gpe' are left unchanged\n",
    "data['Tag'] = data['Tag'].str.replace('gpe', 'geo', regex=False)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "metadata": {},
   "outputs": [],
   "source": [
    "# concatenate sentences together\n",
    "def agg_func(group):\n",
    "    \"\"\"Return [words, tags] for one sentence, each joined by single spaces.\"\"\"\n",
    "    return [' '.join(group['Word'].tolist()), ' '.join(group['Tag'].tolist())]\n",
    "\n",
    "# sort=False keeps sentences in the order they first appear in `data`; the default sorts the\n",
    "# string keys, which would place 'Sentence: 10' ahead of 'Sentence: 2'.\n",
    "# Only the two columns agg_func reads are passed to it.\n",
    "grouped_data = (\n",
    "    data.groupby('Sentence #', sort=False)[['Word', 'Tag']]\n",
    "    .apply(agg_func)\n",
    "    .reset_index(drop=True)\n",
    "    .rename('Sentence')\n",
    ")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 8,
   "metadata": {},
   "outputs": [],
   "source": [
    "# save words and labels as parallel files: line i of each file belongs to the same sentence\n",
    "# explicit encoding so the written bytes do not depend on the platform default\n",
    "with open('ner_text_dataset.txt', 'w', encoding='utf-8') as text_file:\n",
    "    text_file.write('\\n'.join(sentence[0] for sentence in grouped_data))\n",
    "with open('ner_label_dataset.txt', 'w', encoding='utf-8') as label_file:\n",
    "    label_file.write('\\n'.join(sentence[1] for sentence in grouped_data))"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "**Retrieve list of words and tags**"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 20,
   "metadata": {},
   "outputs": [],
   "source": [
    "# distinct words and distinct tags of the NER dataset, each as a list\n",
    "words, tags = (list(set(data[column])) for column in ('Word', 'Tag'))"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### WNUT 17 Dataset"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "metadata": {},
   "outputs": [],
   "source": [
    "# path to the WNUT 17 training split, built from the current working directory\n",
    "file = os.path.join(\n",
    "    os.getcwd(),\n",
    "    'emerging_entities_17',\n",
    "    'wnut17train.conll',\n",
    ")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "metadata": {},
   "outputs": [],
   "source": [
    "# read data\n",
    "# explicit encoding so the read does not depend on the platform default\n",
    "# (assumes the file is UTF-8 - confirm)\n",
    "with open(file, 'r', encoding='utf-8') as f:\n",
    "    raw_text = f.read()\n",
    "\n",
    "# split paragraphs: they are separated by a line holding a single tab\n",
    "paragraphs = raw_text.split('\\n\\t\\n')\n",
    "# tokenize: one token per line; skip blank lines (e.g. from a trailing newline)\n",
    "# so that no ('', '') token ends up in a sentence\n",
    "token_lines = [[line for line in p.split('\\n') if line.strip()] for p in paragraphs]\n",
    "# split word and tag (first and last tab-separated field); drop paragraphs left empty\n",
    "df = [[(line.split('\\t')[0], line.split('\\t')[-1]) for line in lines] for lines in token_lines if lines]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "metadata": {},
   "outputs": [],
   "source": [
    "# simplify tags to match with NER dataset tags\n",
    "def tag_edit(input):\n",
    "    \"\"\"Rewrite the entity type inside a tag string.\n",
    "\n",
    "    The rewrite rules are tried in order and only the first one whose\n",
    "    entity type occurs in ``input`` is applied; a tag matching none of\n",
    "    them is returned unchanged.\n",
    "    \"\"\"\n",
    "    rewrites = (\n",
    "        ('group', 'org'),\n",
    "        ('corporation', 'org'),\n",
    "        ('location', 'geo'),\n",
    "        ('product', 'art'),\n",
    "        ('creative-work', 'art'),\n",
    "        ('person', 'per'),\n",
    "    )\n",
    "    for old, new in rewrites:\n",
    "        if old in input:\n",
    "            return re.sub(old, new, input)\n",
    "    return input\n",
    "\n",
    "# apply the rewrite to the tag (last item) of every token, keeping the word (first item)\n",
    "df = [[(token[0], tag_edit(token[-1])) for token in sentence] for sentence in df]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 8,
   "metadata": {},
   "outputs": [],
   "source": [
    "# concatenate sentences: one [words, tags] pair per sentence, tokens joined by single spaces\n",
    "written = [\n",
    "    [' '.join(token[0] for token in sentence), ' '.join(token[-1] for token in sentence)]\n",
    "    for sentence in df\n",
    "]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 15,
   "metadata": {},
   "outputs": [],
   "source": [
    "# save words and tags as parallel files: line i of each file belongs to the same sentence\n",
    "# the handles get their own names so the `file` path variable is not overwritten,\n",
    "# which keeps the read cell re-runnable; explicit encoding avoids platform-dependent output\n",
    "with open('wnut17train_conll_text.txt', 'w', encoding='utf-8') as text_file:\n",
    "    text_file.write('\\n'.join(d[0] for d in written))\n",
    "with open('wnut17train_conll_target.txt', 'w', encoding='utf-8') as target_file:\n",
    "    target_file.write('\\n'.join(d[-1] for d in written))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 21,
   "metadata": {},
   "outputs": [],
   "source": [
    "# retrieve words and tags\n",
    "# add the distinct whitespace-separated words of the WNUT 17 sentences to `words`\n",
    "text = [token for sentence in written for token in sentence[0].split()]\n",
    "words.extend(set(text))\n",
    "\n",
    "# same for the tags\n",
    "text = [token for sentence in written for token in sentence[-1].split()]\n",
    "tags.extend(set(text))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 27,
   "metadata": {},
   "outputs": [],
   "source": [
    "# update set of words and tags\n",
    "# de-duplicate, then sort: set iteration order of strings can differ between kernel runs,\n",
    "# so sorting keeps the order of the lists (and of anything written from them) reproducible\n",
    "words = sorted(set(words))\n",
    "tags = sorted(set(tags))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 29,
   "metadata": {},
   "outputs": [],
   "source": [
    "# save words and tags, one entry per line\n",
    "# explicit encoding so the written bytes do not depend on the platform default\n",
    "with open('words.txt', 'w', encoding='utf-8') as words_file:\n",
    "    words_file.write('\\n'.join(words))\n",
    "with open('tags.txt', 'w', encoding='utf-8') as tags_file:\n",
    "    tags_file.write('\\n'.join(tags))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.8.6"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}
